{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import featuretools as ft\n",
    "from woodwork.logical_types import Categorical, NaturalLanguage"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# let's load the data again\n",
    "\n",
    "df = pd.read_csv(\"retail.csv\", parse_dates=[\"invoice_date\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create and entity set\n",
    "\n",
    "es = ft.EntitySet(id=\"data\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\Sole\\Documents\\Repositories\\envs\\fsml\\lib\\site-packages\\woodwork\\type_sys\\utils.py:40: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
      "  pd.to_datetime(\n",
      "C:\\Users\\Sole\\Documents\\Repositories\\envs\\fsml\\lib\\site-packages\\woodwork\\type_sys\\utils.py:40: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
      "  pd.to_datetime(\n",
      "C:\\Users\\Sole\\Documents\\Repositories\\envs\\fsml\\lib\\site-packages\\woodwork\\type_sys\\utils.py:40: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
      "  pd.to_datetime(\n",
      "C:\\Users\\Sole\\Documents\\Repositories\\envs\\fsml\\lib\\site-packages\\woodwork\\type_sys\\utils.py:40: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
      "  pd.to_datetime(\n"
     ]
    }
   ],
   "source": [
    "# Add the data to the entity\n",
    "\n",
    "es = es.add_dataframe(\n",
    "    dataframe=df,              # the dataframe with the data\n",
    "    dataframe_name=\"data\",     # unique name to associate with this dataframe\n",
    "    index=\"rows\",              # column name to index the items\n",
    "    make_index=True,           # if true, create a new column with unique values\n",
    "    time_index=\"invoice_date\", # column containing time data\n",
    "    logical_types={\n",
    "        \"customer_id\": Categorical, # the id is numerical, but should be handled as categorical\n",
    "        \"description\": NaturalLanguage, # we need to set this variable as text for ft to work\n",
    "    },\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Entityset: data\n",
       "  DataFrames:\n",
       "    data [Rows: 741301, Columns: 8]\n",
       "    invoices [Rows: 40505, Columns: 3]\n",
       "  Relationships:\n",
       "    data.invoice -> invoices.invoice"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Create a new dataframe with invoices\n",
    "# indicating its relationship to the main data\n",
    "\n",
    "es.normalize_dataframe(\n",
    "    base_dataframe_name=\"data\",     # Datarame name from which to split.\n",
    "    new_dataframe_name=\"invoices\",  # Name of the new dataframe.\n",
    "    index=\"invoice\",                # relationship will be created across this column.\n",
    "    copy_columns=[\"customer_id\"],   # columns to remove from base_dataframe and move to new dataframe.\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Entityset: data\n",
       "  DataFrames:\n",
       "    data [Rows: 741301, Columns: 8]\n",
       "    invoices [Rows: 40505, Columns: 3]\n",
       "    customers [Rows: 5410, Columns: 2]\n",
       "  Relationships:\n",
       "    data.invoice -> invoices.invoice\n",
       "    invoices.customer_id -> customers.customer_id"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Create a new dataframe indicating its \n",
    "# relationship to the previous dataframe\n",
    "\n",
    "# now we work with customers\n",
    "\n",
    "es.normalize_dataframe(\n",
    "    base_dataframe_name=\"invoices\",  # note that we use the df from the previous cell\n",
    "    new_dataframe_name=\"customers\",  # the name of the new df\n",
    "    index=\"customer_id\",             # the column that indicates the relationship\n",
    ")\n",
    "\n",
    "es"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# multiple operations simultaneously\n",
    "\n",
    "agg_primitives = [\"mean\", \"max\", \"min\", \"sum\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\Sole\\Documents\\Repositories\\envs\\fsml\\lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:785: FutureWarning: The provided callable <function min at 0x000001E145EB3AC0> is currently using SeriesGroupBy.min. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"min\" instead.\n",
      "  ).agg(to_agg)\n",
      "C:\\Users\\Sole\\Documents\\Repositories\\envs\\fsml\\lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:785: FutureWarning: The provided callable <function sum at 0x000001E145EB3370> is currently using SeriesGroupBy.sum. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"sum\" instead.\n",
      "  ).agg(to_agg)\n",
      "C:\\Users\\Sole\\Documents\\Repositories\\envs\\fsml\\lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:785: FutureWarning: The provided callable <function max at 0x000001E145EB39A0> is currently using SeriesGroupBy.max. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"max\" instead.\n",
      "  ).agg(to_agg)\n",
      "C:\\Users\\Sole\\Documents\\Repositories\\envs\\fsml\\lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:785: FutureWarning: The provided callable <function mean at 0x000001E14BEDC310> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"mean\" instead.\n",
      "  ).agg(to_agg)\n",
      "C:\\Users\\Sole\\Documents\\Repositories\\envs\\fsml\\lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:785: FutureWarning: The provided callable <function sum at 0x000001E145EB3370> is currently using SeriesGroupBy.sum. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"sum\" instead.\n",
      "  ).agg(to_agg)\n",
      "C:\\Users\\Sole\\Documents\\Repositories\\envs\\fsml\\lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:785: FutureWarning: The provided callable <function max at 0x000001E145EB39A0> is currently using SeriesGroupBy.max. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"max\" instead.\n",
      "  ).agg(to_agg)\n",
      "C:\\Users\\Sole\\Documents\\Repositories\\envs\\fsml\\lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:785: FutureWarning: The provided callable <function min at 0x000001E145EB3AC0> is currently using SeriesGroupBy.min. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"min\" instead.\n",
      "  ).agg(to_agg)\n",
      "C:\\Users\\Sole\\Documents\\Repositories\\envs\\fsml\\lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:785: FutureWarning: The provided callable <function mean at 0x000001E14BEDC310> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"mean\" instead.\n",
      "  ).agg(to_agg)\n",
      "C:\\Users\\Sole\\Documents\\Repositories\\envs\\fsml\\lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:785: FutureWarning: The provided callable <function mean at 0x000001E14BEDC310> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"mean\" instead.\n",
      "  ).agg(to_agg)\n",
      "C:\\Users\\Sole\\Documents\\Repositories\\envs\\fsml\\lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:785: FutureWarning: The provided callable <function sum at 0x000001E145EB3370> is currently using SeriesGroupBy.sum. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"sum\" instead.\n",
      "  ).agg(to_agg)\n",
      "C:\\Users\\Sole\\Documents\\Repositories\\envs\\fsml\\lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:785: FutureWarning: The provided callable <function min at 0x000001E145EB3AC0> is currently using SeriesGroupBy.min. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"min\" instead.\n",
      "  ).agg(to_agg)\n",
      "C:\\Users\\Sole\\Documents\\Repositories\\envs\\fsml\\lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:785: FutureWarning: The provided callable <function max at 0x000001E145EB39A0> is currently using SeriesGroupBy.max. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"max\" instead.\n",
      "  ).agg(to_agg)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[<Feature: MAX(data.price)>,\n",
       " <Feature: MAX(data.quantity)>,\n",
       " <Feature: MEAN(data.price)>,\n",
       " <Feature: MEAN(data.quantity)>,\n",
       " <Feature: MIN(data.price)>,\n",
       " <Feature: MIN(data.quantity)>,\n",
       " <Feature: SUM(data.price)>,\n",
       " <Feature: SUM(data.quantity)>,\n",
       " <Feature: MAX(invoices.MEAN(data.price))>,\n",
       " <Feature: MAX(invoices.MEAN(data.quantity))>,\n",
       " <Feature: MAX(invoices.MIN(data.price))>,\n",
       " <Feature: MAX(invoices.MIN(data.quantity))>,\n",
       " <Feature: MAX(invoices.SUM(data.price))>,\n",
       " <Feature: MAX(invoices.SUM(data.quantity))>,\n",
       " <Feature: MEAN(invoices.MAX(data.price))>,\n",
       " <Feature: MEAN(invoices.MAX(data.quantity))>,\n",
       " <Feature: MEAN(invoices.MEAN(data.price))>,\n",
       " <Feature: MEAN(invoices.MEAN(data.quantity))>,\n",
       " <Feature: MEAN(invoices.MIN(data.price))>,\n",
       " <Feature: MEAN(invoices.MIN(data.quantity))>,\n",
       " <Feature: MEAN(invoices.SUM(data.price))>,\n",
       " <Feature: MEAN(invoices.SUM(data.quantity))>,\n",
       " <Feature: MIN(invoices.MAX(data.price))>,\n",
       " <Feature: MIN(invoices.MAX(data.quantity))>,\n",
       " <Feature: MIN(invoices.MEAN(data.price))>,\n",
       " <Feature: MIN(invoices.MEAN(data.quantity))>,\n",
       " <Feature: MIN(invoices.SUM(data.price))>,\n",
       " <Feature: MIN(invoices.SUM(data.quantity))>,\n",
       " <Feature: SUM(invoices.MAX(data.price))>,\n",
       " <Feature: SUM(invoices.MAX(data.quantity))>,\n",
       " <Feature: SUM(invoices.MEAN(data.price))>,\n",
       " <Feature: SUM(invoices.MEAN(data.quantity))>,\n",
       " <Feature: SUM(invoices.MIN(data.price))>,\n",
       " <Feature: SUM(invoices.MIN(data.quantity))>]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# create all features simultaneously\n",
    "\n",
    "feature_matrix, feature_defs = ft.dfs(\n",
    "    entityset=es,                                 # the entity set\n",
    "    target_dataframe_name=\"customers\",            # the dataframe for wich to create the feature\n",
    "    agg_primitives=agg_primitives,                # the aggregation primitives\n",
    "    trans_primitives=[],                          # empy list to override defo params\n",
    ")\n",
    "\n",
    "# display name of created features\n",
    "feature_defs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>MAX(data.price)</th>\n",
       "      <th>MAX(data.quantity)</th>\n",
       "      <th>MEAN(data.price)</th>\n",
       "      <th>MEAN(data.quantity)</th>\n",
       "      <th>MIN(data.price)</th>\n",
       "      <th>MIN(data.quantity)</th>\n",
       "      <th>SUM(data.price)</th>\n",
       "      <th>SUM(data.quantity)</th>\n",
       "      <th>MAX(invoices.MEAN(data.price))</th>\n",
       "      <th>MAX(invoices.MEAN(data.quantity))</th>\n",
       "      <th>...</th>\n",
       "      <th>MIN(invoices.MEAN(data.price))</th>\n",
       "      <th>MIN(invoices.MEAN(data.quantity))</th>\n",
       "      <th>MIN(invoices.SUM(data.price))</th>\n",
       "      <th>MIN(invoices.SUM(data.quantity))</th>\n",
       "      <th>SUM(invoices.MAX(data.price))</th>\n",
       "      <th>SUM(invoices.MAX(data.quantity))</th>\n",
       "      <th>SUM(invoices.MEAN(data.price))</th>\n",
       "      <th>SUM(invoices.MEAN(data.quantity))</th>\n",
       "      <th>SUM(invoices.MIN(data.price))</th>\n",
       "      <th>SUM(invoices.MIN(data.quantity))</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>customer_id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>13085.0</th>\n",
       "      <td>830.12</td>\n",
       "      <td>48.0</td>\n",
       "      <td>12.413587</td>\n",
       "      <td>9.076087</td>\n",
       "      <td>0.55</td>\n",
       "      <td>-48.0</td>\n",
       "      <td>1142.05</td>\n",
       "      <td>835.0</td>\n",
       "      <td>830.120000</td>\n",
       "      <td>20.750000</td>\n",
       "      <td>...</td>\n",
       "      <td>1.828571</td>\n",
       "      <td>-15.428571</td>\n",
       "      <td>10.50</td>\n",
       "      <td>-108.0</td>\n",
       "      <td>886.42</td>\n",
       "      <td>209.0</td>\n",
       "      <td>860.134136</td>\n",
       "      <td>78.982882</td>\n",
       "      <td>839.97</td>\n",
       "      <td>-3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13078.0</th>\n",
       "      <td>12.75</td>\n",
       "      <td>300.0</td>\n",
       "      <td>3.961193</td>\n",
       "      <td>14.061988</td>\n",
       "      <td>0.19</td>\n",
       "      <td>-14.0</td>\n",
       "      <td>3386.82</td>\n",
       "      <td>12023.0</td>\n",
       "      <td>12.750000</td>\n",
       "      <td>61.333333</td>\n",
       "      <td>...</td>\n",
       "      <td>0.190000</td>\n",
       "      <td>-14.000000</td>\n",
       "      <td>0.19</td>\n",
       "      <td>-15.0</td>\n",
       "      <td>702.52</td>\n",
       "      <td>3696.0</td>\n",
       "      <td>408.733369</td>\n",
       "      <td>843.462091</td>\n",
       "      <td>207.15</td>\n",
       "      <td>45.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15362.0</th>\n",
       "      <td>9.95</td>\n",
       "      <td>48.0</td>\n",
       "      <td>3.612000</td>\n",
       "      <td>9.200000</td>\n",
       "      <td>0.21</td>\n",
       "      <td>1.0</td>\n",
       "      <td>144.48</td>\n",
       "      <td>368.0</td>\n",
       "      <td>3.628261</td>\n",
       "      <td>13.117647</td>\n",
       "      <td>...</td>\n",
       "      <td>3.590000</td>\n",
       "      <td>6.304348</td>\n",
       "      <td>61.03</td>\n",
       "      <td>145.0</td>\n",
       "      <td>18.90</td>\n",
       "      <td>60.0</td>\n",
       "      <td>7.218261</td>\n",
       "      <td>19.421995</td>\n",
       "      <td>0.86</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18102.0</th>\n",
       "      <td>3580.80</td>\n",
       "      <td>1008.0</td>\n",
       "      <td>10.831367</td>\n",
       "      <td>175.196629</td>\n",
       "      <td>0.27</td>\n",
       "      <td>-324.0</td>\n",
       "      <td>11567.90</td>\n",
       "      <td>187110.0</td>\n",
       "      <td>3580.800000</td>\n",
       "      <td>624.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.480000</td>\n",
       "      <td>-324.000000</td>\n",
       "      <td>0.48</td>\n",
       "      <td>-432.0</td>\n",
       "      <td>8432.42</td>\n",
       "      <td>35654.0</td>\n",
       "      <td>8048.487489</td>\n",
       "      <td>26429.403963</td>\n",
       "      <td>7858.33</td>\n",
       "      <td>17709.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18087.0</th>\n",
       "      <td>852.80</td>\n",
       "      <td>3906.0</td>\n",
       "      <td>11.971368</td>\n",
       "      <td>78.189474</td>\n",
       "      <td>0.36</td>\n",
       "      <td>-96.0</td>\n",
       "      <td>1137.28</td>\n",
       "      <td>7428.0</td>\n",
       "      <td>852.800000</td>\n",
       "      <td>3906.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.820000</td>\n",
       "      <td>-96.000000</td>\n",
       "      <td>0.82</td>\n",
       "      <td>-288.0</td>\n",
       "      <td>960.41</td>\n",
       "      <td>4648.0</td>\n",
       "      <td>912.976463</td>\n",
       "      <td>4374.814472</td>\n",
       "      <td>883.90</td>\n",
       "      <td>4162.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 34 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "             MAX(data.price)  MAX(data.quantity)  MEAN(data.price)  \\\n",
       "customer_id                                                          \n",
       "13085.0               830.12                48.0         12.413587   \n",
       "13078.0                12.75               300.0          3.961193   \n",
       "15362.0                 9.95                48.0          3.612000   \n",
       "18102.0              3580.80              1008.0         10.831367   \n",
       "18087.0               852.80              3906.0         11.971368   \n",
       "\n",
       "             MEAN(data.quantity)  MIN(data.price)  MIN(data.quantity)  \\\n",
       "customer_id                                                             \n",
       "13085.0                 9.076087             0.55               -48.0   \n",
       "13078.0                14.061988             0.19               -14.0   \n",
       "15362.0                 9.200000             0.21                 1.0   \n",
       "18102.0               175.196629             0.27              -324.0   \n",
       "18087.0                78.189474             0.36               -96.0   \n",
       "\n",
       "             SUM(data.price)  SUM(data.quantity)  \\\n",
       "customer_id                                        \n",
       "13085.0              1142.05               835.0   \n",
       "13078.0              3386.82             12023.0   \n",
       "15362.0               144.48               368.0   \n",
       "18102.0             11567.90            187110.0   \n",
       "18087.0              1137.28              7428.0   \n",
       "\n",
       "             MAX(invoices.MEAN(data.price))  \\\n",
       "customer_id                                   \n",
       "13085.0                          830.120000   \n",
       "13078.0                           12.750000   \n",
       "15362.0                            3.628261   \n",
       "18102.0                         3580.800000   \n",
       "18087.0                          852.800000   \n",
       "\n",
       "             MAX(invoices.MEAN(data.quantity))  ...  \\\n",
       "customer_id                                     ...   \n",
       "13085.0                              20.750000  ...   \n",
       "13078.0                              61.333333  ...   \n",
       "15362.0                              13.117647  ...   \n",
       "18102.0                             624.000000  ...   \n",
       "18087.0                            3906.000000  ...   \n",
       "\n",
       "             MIN(invoices.MEAN(data.price))  \\\n",
       "customer_id                                   \n",
       "13085.0                            1.828571   \n",
       "13078.0                            0.190000   \n",
       "15362.0                            3.590000   \n",
       "18102.0                            0.480000   \n",
       "18087.0                            0.820000   \n",
       "\n",
       "             MIN(invoices.MEAN(data.quantity))  MIN(invoices.SUM(data.price))  \\\n",
       "customer_id                                                                     \n",
       "13085.0                             -15.428571                          10.50   \n",
       "13078.0                             -14.000000                           0.19   \n",
       "15362.0                               6.304348                          61.03   \n",
       "18102.0                            -324.000000                           0.48   \n",
       "18087.0                             -96.000000                           0.82   \n",
       "\n",
       "             MIN(invoices.SUM(data.quantity))  SUM(invoices.MAX(data.price))  \\\n",
       "customer_id                                                                    \n",
       "13085.0                                -108.0                         886.42   \n",
       "13078.0                                 -15.0                         702.52   \n",
       "15362.0                                 145.0                          18.90   \n",
       "18102.0                                -432.0                        8432.42   \n",
       "18087.0                                -288.0                         960.41   \n",
       "\n",
       "             SUM(invoices.MAX(data.quantity))  SUM(invoices.MEAN(data.price))  \\\n",
       "customer_id                                                                     \n",
       "13085.0                                 209.0                      860.134136   \n",
       "13078.0                                3696.0                      408.733369   \n",
       "15362.0                                  60.0                        7.218261   \n",
       "18102.0                               35654.0                     8048.487489   \n",
       "18087.0                                4648.0                      912.976463   \n",
       "\n",
       "             SUM(invoices.MEAN(data.quantity))  SUM(invoices.MIN(data.price))  \\\n",
       "customer_id                                                                     \n",
       "13085.0                              78.982882                         839.97   \n",
       "13078.0                             843.462091                         207.15   \n",
       "15362.0                              19.421995                           0.86   \n",
       "18102.0                           26429.403963                        7858.33   \n",
       "18087.0                            4374.814472                         883.90   \n",
       "\n",
       "             SUM(invoices.MIN(data.quantity))  \n",
       "customer_id                                    \n",
       "13085.0                                  -3.0  \n",
       "13078.0                                  45.0  \n",
       "15362.0                                   3.0  \n",
       "18102.0                               17709.0  \n",
       "18087.0                                4162.0  \n",
       "\n",
       "[5 rows x 34 columns]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# new features\n",
    "\n",
    "feature_matrix.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "date_primitives = [\"month\", \"weekday\"]\n",
    "\n",
    "text_primitives = [\"num_words\"]\n",
    "\n",
    "trans_primitives = date_primitives + text_primitives "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "agg_primitives = [\"mean\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\Sole\\Documents\\Repositories\\envs\\fsml\\lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:785: FutureWarning: The provided callable <function mean at 0x000001E14BEDC310> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"mean\" instead.\n",
      "  ).agg(to_agg)\n",
      "C:\\Users\\Sole\\Documents\\Repositories\\envs\\fsml\\lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:785: FutureWarning: The provided callable <function mean at 0x000001E14BEDC310> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"mean\" instead.\n",
      "  ).agg(to_agg)\n",
      "C:\\Users\\Sole\\Documents\\Repositories\\envs\\fsml\\lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:785: FutureWarning: The provided callable <function mean at 0x000001E14BEDC310> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"mean\" instead.\n",
      "  ).agg(to_agg)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[<Feature: MEAN(data.price)>,\n",
       " <Feature: MEAN(data.quantity)>,\n",
       " <Feature: MONTH(first_invoices_time)>,\n",
       " <Feature: WEEKDAY(first_invoices_time)>,\n",
       " <Feature: MEAN(invoices.MEAN(data.price))>,\n",
       " <Feature: MEAN(invoices.MEAN(data.quantity))>,\n",
       " <Feature: MEAN(data.NUM_WORDS(description))>,\n",
       " <Feature: MEAN(invoices.MEAN(data.NUM_WORDS(description)))>]"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "feature_matrix, feature_defs = ft.dfs(\n",
    "    entityset=es,                                 # the entity set\n",
    "    target_dataframe_name=\"customers\",            # the dataframe for wich to create the feature\n",
    "    agg_primitives=agg_primitives,                # the aggregation primitives\n",
    "    trans_primitives=trans_primitives,            # the operation to create the new features\n",
    "    max_depth=3,\n",
    ")\n",
    "\n",
    "# display name of created features\n",
    "feature_defs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>MEAN(data.price)</th>\n",
       "      <th>MEAN(data.quantity)</th>\n",
       "      <th>MONTH(first_invoices_time)</th>\n",
       "      <th>WEEKDAY(first_invoices_time)</th>\n",
       "      <th>MEAN(invoices.MEAN(data.price))</th>\n",
       "      <th>MEAN(invoices.MEAN(data.quantity))</th>\n",
       "      <th>MEAN(data.NUM_WORDS(description))</th>\n",
       "      <th>MEAN(invoices.MEAN(data.NUM_WORDS(description)))</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>customer_id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>13085.0</th>\n",
       "      <td>12.413587</td>\n",
       "      <td>9.076087</td>\n",
       "      <td>12</td>\n",
       "      <td>1</td>\n",
       "      <td>86.013414</td>\n",
       "      <td>7.898288</td>\n",
       "      <td>4.239130</td>\n",
       "      <td>3.871398</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13078.0</th>\n",
       "      <td>3.961193</td>\n",
       "      <td>14.061988</td>\n",
       "      <td>12</td>\n",
       "      <td>1</td>\n",
       "      <td>4.302457</td>\n",
       "      <td>8.878548</td>\n",
       "      <td>4.309942</td>\n",
       "      <td>4.342485</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15362.0</th>\n",
       "      <td>3.612000</td>\n",
       "      <td>9.200000</td>\n",
       "      <td>12</td>\n",
       "      <td>1</td>\n",
       "      <td>3.609130</td>\n",
       "      <td>9.710997</td>\n",
       "      <td>4.225000</td>\n",
       "      <td>4.264706</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18102.0</th>\n",
       "      <td>10.831367</td>\n",
       "      <td>175.196629</td>\n",
       "      <td>12</td>\n",
       "      <td>1</td>\n",
       "      <td>52.604493</td>\n",
       "      <td>172.741202</td>\n",
       "      <td>4.336142</td>\n",
       "      <td>4.205888</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18087.0</th>\n",
       "      <td>11.971368</td>\n",
       "      <td>78.189474</td>\n",
       "      <td>12</td>\n",
       "      <td>1</td>\n",
       "      <td>43.475070</td>\n",
       "      <td>208.324499</td>\n",
       "      <td>4.252632</td>\n",
       "      <td>3.975132</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             MEAN(data.price)  MEAN(data.quantity) MONTH(first_invoices_time)  \\\n",
       "customer_id                                                                     \n",
       "13085.0             12.413587             9.076087                         12   \n",
       "13078.0              3.961193            14.061988                         12   \n",
       "15362.0              3.612000             9.200000                         12   \n",
       "18102.0             10.831367           175.196629                         12   \n",
       "18087.0             11.971368            78.189474                         12   \n",
       "\n",
       "            WEEKDAY(first_invoices_time)  MEAN(invoices.MEAN(data.price))  \\\n",
       "customer_id                                                                 \n",
       "13085.0                                1                        86.013414   \n",
       "13078.0                                1                         4.302457   \n",
       "15362.0                                1                         3.609130   \n",
       "18102.0                                1                        52.604493   \n",
       "18087.0                                1                        43.475070   \n",
       "\n",
       "             MEAN(invoices.MEAN(data.quantity))  \\\n",
       "customer_id                                       \n",
       "13085.0                                7.898288   \n",
       "13078.0                                8.878548   \n",
       "15362.0                                9.710997   \n",
       "18102.0                              172.741202   \n",
       "18087.0                              208.324499   \n",
       "\n",
       "             MEAN(data.NUM_WORDS(description))  \\\n",
       "customer_id                                      \n",
       "13085.0                               4.239130   \n",
       "13078.0                               4.309942   \n",
       "15362.0                               4.225000   \n",
       "18102.0                               4.336142   \n",
       "18087.0                               4.252632   \n",
       "\n",
       "             MEAN(invoices.MEAN(data.NUM_WORDS(description)))  \n",
       "customer_id                                                    \n",
       "13085.0                                              3.871398  \n",
       "13078.0                                              4.342485  \n",
       "15362.0                                              4.264706  \n",
       "18102.0                                              4.205888  \n",
       "18087.0                                              3.975132  "
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "feature_matrix.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "fsml",
   "language": "python",
   "name": "fsml"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.5"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {
    "height": "calc(100% - 180px)",
    "left": "10px",
    "top": "150px",
    "width": "165px"
   },
   "toc_section_display": "block",
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
